{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "74cdb1b7-5c47-4cc0-95f5-1fa3e3b88e93",
   "metadata": {},
   "source": [
    "# Spark 读取 proto 文件\n",
    "\n",
    "参考文档：<https://www.aidoczh.com/spark/sql-data-sources-protobuf.html>\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "96585db6-5241-4e55-81fa-14f38a712061",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "25/07/14 16:59:56 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n",
      "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties\n",
      "Setting default log level to \"WARN\".\n",
      "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
      "25/07/14 16:59:57 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "from pyspark.sql import SparkSession\n",
    "\n",
    "# Set JAVA_HOME for this process before the Spark session is created.\n",
    "os.environ[\"JAVA_HOME\"] = \"/opt/develop/soft/java/corretto-1.8.0_452/Contents/Home\"\n",
    "\n",
    "# Build the session used by every cell below (getOrCreate returns an\n",
    "# already-running session if this kernel has one).\n",
    "spark = (\n",
    "    SparkSession.builder\n",
    "    .appName(\"jb-20250714\")\n",
    "    .getOrCreate()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "392f7bff-840b-43fa-abc0-3d4181d48cef",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                                                "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- dt: string (nullable = true)\n",
      " |-- pkg_name: string (nullable = true)\n",
      " |-- bid_event_name: string (nullable = true)\n",
      " |-- partner: string (nullable = true)\n",
      " |-- media_source: string (nullable = true)\n",
      " |-- geo: string (nullable = true)\n",
      " |-- campaign_name: string (nullable = true)\n",
      " |-- clicks: long (nullable = true)\n",
      " |-- installs: long (nullable = true)\n",
      " |-- bid_event_count: long (nullable = true)\n",
      " |-- receive_time: long (nullable = true)\n",
      " |-- pay_in: decimal(12,6) (nullable = true)\n",
      " |-- pay_out: decimal(12,6) (nullable = true)\n",
      " |-- campaign_bid_type: string (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "file_path = \"/opt/develop/tmp/data/raw-20250714/20250713.gz.parquet\"\n",
    "\n",
    "# Read the Parquet file and print its column names and types.\n",
    "(\n",
    "    spark.read\n",
    "    .parquet(file_path)\n",
    "    .printSchema()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "7e75dc77-a880-4378-ab40-3d418690a2ab",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- dt: string (nullable = true)\n",
      " |-- pkg_name: string (nullable = true)\n",
      " |-- bid_event_name: string (nullable = true)\n",
      " |-- partner: string (nullable = true)\n",
      " |-- media_source: string (nullable = true)\n",
      " |-- geo: string (nullable = true)\n",
      " |-- campaign_name: string (nullable = true)\n",
      " |-- clicks: long (nullable = true)\n",
      " |-- installs: long (nullable = true)\n",
      " |-- bid_event_count: long (nullable = true)\n",
      " |-- receive_time: long (nullable = true)\n",
      " |-- pay_in: decimal(12,6) (nullable = true)\n",
      " |-- pay_out: decimal(12,6) (nullable = true)\n",
      " |-- campaign_bid_type: string (nullable = true)\n",
      "\n",
      "+--------+--------------------+--------------+-------+---------------+---+--------------------+------+--------+---------------+------------+--------+--------+-----------------+\n",
      "|      dt|            pkg_name|bid_event_name|partner|   media_source|geo|       campaign_name|clicks|installs|bid_event_count|receive_time|  pay_in| pay_out|campaign_bid_type|\n",
      "+--------+--------------------+--------------+-------+---------------+---+--------------------+------+--------+---------------+------------+--------+--------+-----------------+\n",
      "|20250713|com.finaccel.android|          None|shareit|    kakjaed_int| AD|kakjaed_KREDIVO_A...|     1|       0|              0|  1752461394|0.000000|0.000000|              CPE|\n",
      "|20250713|com.finaccel.android|          None|shareit|   axdsygpk_int| AE|axdsygpk_KREDIVO_...|   614|       0|              0|  1752461394|0.000000|0.000000|              CPE|\n",
      "|20250713|com.finaccel.android|          None|shareit|    kakjaed_int| AE|kakjaed_KREDIVO_A...|  1106|       0|              0|  1752461394|0.000000|0.000000|              CPE|\n",
      "|20250713|com.finaccel.android|          None|shareit|mimimobil5r_int| AE|mimimobil5r_KREDI...|  1034|       0|              0|  1752461394|0.000000|0.000000|              CPE|\n",
      "|20250713|com.finaccel.android|          None|shareit|    ribfvju_int| AE|ribfvju_KREDIVO_A...|   639|       0|              0|  1752461394|0.000000|0.000000|              CPE|\n",
      "|20250713|com.finaccel.android|          None|shareit|    shareit_int| AE|                None|     4|       0|              0|  1752461394|0.000000|0.000000|                 |\n",
      "|20250713|com.finaccel.android|          None|shareit|   axdsygpk_int| AF|axdsygpk_KREDIVO_...|    83|       0|              0|  1752461394|0.000000|0.000000|              CPE|\n",
      "|20250713|com.finaccel.android|          None|shareit|    kakjaed_int| AF|kakjaed_KREDIVO_A...|   145|       0|              0|  1752461394|0.000000|0.000000|              CPE|\n",
      "|20250713|com.finaccel.android|          None|shareit|mimimobil5r_int| AF|mimimobil5r_KREDI...|   126|       0|              0|  1752461394|0.000000|0.000000|              CPE|\n",
      "|20250713|com.finaccel.android|          None|shareit|    ribfvju_int| AF|ribfvju_KREDIVO_A...|    84|       0|              0|  1752461394|0.000000|0.000000|              CPE|\n",
      "+--------+--------------------+--------------+-------+---------------+---+--------------------+------+--------+---------------+------------+--------+--------+-----------------+\n",
      "only showing top 10 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "file_path = \"/opt/develop/tmp/data/raw-20250714/20250713.gz.parquet\"\n",
    "\n",
    "# Load the Parquet file into a DataFrame and show its schema.\n",
    "df = spark.read.parquet(file_path)\n",
    "df.printSchema()\n",
    "\n",
    "# Register the DataFrame as temp view `t_data` so it can be queried with SQL.\n",
    "df.createOrReplaceTempView(\"t_data\")\n",
    "\n",
    "# Show up to 10 rows belonging to a single package name.\n",
    "spark.sql(\n",
    "    '''\n",
    "select * from t_data\n",
    "where pkg_name='com.finaccel.android'\n",
    "'''\n",
    ").show(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1b3afbf3-848c-4a63-9646-77375c88432a",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
